home *** CD-ROM | disk | FTP | other *** search
- /*=======================================================================*/
- /* File Name : 'geturl.rexx', 02-Jan-95 */
- /*=======================================================================*/
- /* Script to download HTML systems across the network */
- /*
- Written by James Burton (burton@cs.latrobe.edu.au)
- 02-Jan-95
-
- **NOTE: requires AmiTCP & TCP: device**
- */
-
- gv.version = 'v1.03'
- version_string = '$VER: GetURL.rexx'
-
- /*=======================================================================*/
- /* Global Variables */
- /*=======================================================================*/
-
- /* Constants (huh?) */
- gv.cr = '0d'x
- gv.lf = '0a'x
- gv.crlf = gv.cr||gv.lf
- gv.true = 1
- gv.false = 0
-
- /* Proxy stuff */
- gv.http_proxy_host = ''
- gv.http_proxy_port = '80'
- gv.ftp_proxy_host = ''
- gv.ftp_proxy_port = '80'
-
- /* Root Url */
- gv.root_url = ''
- gv.root_method = ''
- gv.root_host = ''
- gv.root_port = ''
- gv.root_path = ''
-
- /* Files */
- gv.input_file = ''
- gv.output_file = ''
- gv.agenda_file = 't:agenda-'||tmpname()
- gv.prune_file = 't:prune-'||tmpname()
- gv.header_file = 't:header-'||tmpname()
- gv.failed_file = 't:failed-'||tmpname()
-
- /* Flags */
- gv.recursive_fetch = gv.false
- gv.use_proxy_cache = gv.true
- gv.retry_failed = gv.false
- gv.use_match = gv.false
- gv.save_headers = gv.false /* Change the default to true for Brian :-) */
- gv.use_modified_since = gv.false
- gv.use_associative = gv.false
- gv.use_level_marker = gv.false
-
- /* Misc */
- gv.local_prefix = 'file://localhost|Mosaic:'
- gv.match_host = ''
- gv.match_path = ''
- gv.match_url = ''
- gv.user_address = ''
- gv.number_iterations = ''
- gv.depth = 0 /* bt */
- gv.max_depth = 20 /* bt */
- gv.max_length = '' /* bt */
- gv.timezone = '+1100'
- gv.problem_address = 'burton@latcs1.lat.oz.au'
- gv.delay = 2
- gv.level_marker = '-'
-
- /* arrays to hold agenda items */
- /* use:
- agenda. defaults to 0
- agenda.head contains index of 1st path yet-to-visit
- agenda.tail contains index of last path yet-to-visit
- agenda.N contains Nth path yet-to-visit
- agenda.pathname contains recursion-level of path-yet-to-visit
- prune.tail contains index of next visited-path
- prune.N contains name of a visited-path
- prune.pathname contains non-zero
- failed.tail contains index of next failed-path
- failed.N contains name of a failed path
- failed.pathname contains non-zero
- */
- agenda. = 0
- agenda.head = 1
- agenda.tail = 0
- prune. = 0
- prune.tail = 0
- failed. = 0
- failed.tail = 0
-
- /*=======================================================================*/
- /* Main */
- /*=======================================================================*/
-
- Main:
- parse arg gv.root_url rest
-
- if substr(gv.root_url,1,1) = '-' then do /* no URL */
- rest = gv.root_url rest
- gv.root_url = ''
- end
-
- signal ON break_c /* install control-c handler */
-
- OpenLibrary('rexxsupport.library',gv.true) /* insist on using this library */
- OpenLibrary('rexxarplib.library') /* non essential library */
-
- Configure() /* setup global variables */
-
- /* look for and parse arguments */
- arglist = rest
- do while arglist ~= ''
- parse var arglist flagname arglist
- uflagname = upper(flagname)
- DoArguments(uflagname)
- end
-
- if gv.root_url ~= '' then
- do
- if gv.input_file = '' then
- do
- /* parse Root URL */
- parse value ParseUrl(gv.root_url) with ok','url','gv.root_method','gv.root_host','gv.root_port','gv.root_path
- if ~ok then do
- say 'failed to parse initial URL:' gv.root_url
- EXIT 20
- end
- if gv.output_file = '' then
- outputfile = MakePathname(gv.root_host,gv.root_path)
- else
- outputfile = gv.output_file
- if ~AppendHTTP(gv.root_host,gv.root_port,gv.root_path,outputfile,gv.agenda_file,gv.root_url) then
- AddToFailed(gv.failed_file,gv.root_url)
- else
- do
- if ~PruneUrl(gv.root_url) then
- AddToPruned(gv.prune_file,gv.root_url) /* don't want to visit the root again */
- end
- if gv.recursive_fetch then
- DoAgenda(gv.agenda_file)
- end
- else /* ignore URL, use input file */
- do
- if gv.output_file = '' then
- outputfile = 't:output-'tmpname()
- else
- outputfile = gv.output_file
- AppendFile(gv.input_file,outputfile,gv.agenda_file)
- if gv.recursive_fetch then
- DoAgenda(gv.agenda_file)
- end
- end
- else /* no URL specified */
- do
- if gv.input_file ~= '' then
- do
- if gv.output_file = '' then
- outputfile = 't:output-'tmpname()
- else
- outputfile = gv.output_file
- AppendFile(gv.input_file,outputfile,gv.agenda_file)
- if gv.recursive_fetch then
- DoAgenda(gv.agenda_file)
- end
- else
- do
- /* no input file, no URL */
- /* may be something in the agenda */
- if gv.recursive_fetch then
- DoAgenda(gv.agenda_file)
- end
- end
-
- if gv.recursive_fetch & gv.retry_failed then do
- if gv.use_associative then
- BTAddFileTo(agenda,gv.failed_file)
- else
- address command 'copy' gv.failed_file gv.agenda_file
- address command 'delete >NIL:' gv.failed_file
- say 'Retrying previously failed URLs...'
- DoAgenda(gv.agenda_file)
- end
-
- if gv.use_associative then do
- BTDumpAgendaToFile(gv.agenda_file)
- BTDumpFailedToFile(gv.failed_file)
- end
- EXIT
-
- /*=======================================================================*/
- /* DoArguments */
- /*=======================================================================*/
-
- DoArguments:
- parse var uflagname
- select
- when abbrev('-HELP',uflagname,1) then
- printusage()
- when abbrev('-RECURSIVE',uflagname,2) then
- gv.recursive_fetch = gv.true
- when abbrev('-HOST',uflagname,2) then
- parse var arglist gv.match_host arglist
- when abbrev('-PATH',uflagname,2) then
- parse var arglist gv.match_path arglist
- when abbrev('-URL',uflagname,2) then
- do
- parse var arglist gv.match_url arglist
- parse var gv.match_url dummy'://'gv.match_host'/'gv.match_path
- end
- when abbrev('-OUTPUT',uflagname,2) then
- parse var arglist gv.output_file arglist
- when abbrev('-NOPROXY',uflagname,3) then
- do
- gv.http_proxy_host = ''
- gv.ftp_proxy_host = ''
- end
- when abbrev('-INPUT',uflagname,2) then
- do
- if arglist = '' then
- gv.input_file = '-'
- else
- do
- parse var arglist gv.input_file arglist
- if substr(gv.input_file,1,1) = '-' then do
- /* put the argument back please */
- arglist = gv.input_file arglist
- gv.input_file = '-' /* flag to use stdin */
- end
- end
- end
- when abbrev('-VISITED',uflagname,2) then
- do
- parse var arglist gv.prune_file arglist
- if gv.use_associative then
- BTAddFileTo(prune,gv.prune_file)
- end
- when abbrev('-UNVISITED',uflagname,2) then
- do
- parse var arglist gv.agenda_file arglist
- if gv.use_associative then
- BTAddFileTo(agenda,gv.agenda_file)
- end
- when abbrev('-DIRECTORY',uflagname,2) then
- say 'DIRECTORY flag not implemented yet'
- when abbrev('-UPDATE',uflagname,2) then
- say 'UPDATE flag not implemented yet'
- when abbrev('-NOPROXYCACHE',uflagname,2) then
- gv.use_proxy_cache = gv.false
- when abbrev('-PROBLEM',uflagname,2) then
- do
- writeln(stdout,'Start typing problem or bug report...<end with '.' on line by itself>')
- SendMail(stdin,gv.problem_address,'GetUrl.rexx Problem' gv.version)
- end
- when abbrev('-SAVEROOT',uflagname,2) then
- do
- parse var arglist newdir arglist
- if (right(newdir,1) ~= ':') & (right(newdir,1) ~= '/') then
- newdir = newdir'/'
- parse var gv.local_prefix before'|'after
- gv.local_prefix = before'|'newdir
- end
- when abbrev('-LENGTH',uflagname,2) then
- parse var arglist gv.max_length arglist
- when abbrev('-DEPTH',uflagname,2) then
- do
- parse var arglist gv.max_depth arglist
- gv.use_level_marker = gv.true
- end
- when abbrev('-SAVEHEADERS',uflagname,2) then
- gv.save_headers = gv.true
- when abbrev('-NUMBER',uflagname,2) then
- parse var arglist gv.number_iterations arglist
- when abbrev('-NEWVERSION',uflagname,2) then
- do
- parse var arglist newfile arglist
- if newfile = '' then do
- writech(stdout,'Destination for new version? [t:GetURL.rexx] ')
- newfile = readln(stdin)
- if newfile = '' then
- newfile = 't:GetURL.rexx'
- end
- say 'downloading new script...'
- CALL AppendHTTP('www.cs.latrobe.edu.au','','~burton/Public/GetURL.rexx',newfile,gv.agenda_file,'')
- say 'downloaded' newfile', fetching documentation...'
- p = lastpos('.',newfile)
- if p > 0 then
- newfile = left(newfile,p)
- newfile = newfile'.doc'
- CALL AppendHTTP('www.cs.latrobe.edu.au','','~burton/Public/GetURL.doc',newfile,gv.agenda_file,'')
- say 'downloaded' newfile
- end
- when abbrev('-FAILED',uflagname,2) then
- do
- parse var arglist gv.failed_file arglist
- if gv.use_associative then
- BTAddFileTo(failed,failed_file)
- end
- when abbrev('-RETRY',uflagname,2) then
- gv.retry_failed = gv.true
- when abbrev('-PATTERNMATCHING',uflagname,2) then
- do
- writech(stdout,'should I really download Match? This will allow the use of regular expression patterns. [y/n] ')
- line = readln(stdin)
- if upper(left(line,1)) = 'Y' then do
- say 'Downloading Match...'
- newfile = 't:Match'
- CALL AppendHTTP('www.cs.latrobe.edu.au','','~burton/Public/C/Match',newfile,gv.agenda_file,'')
- say 'Match is now in your T: directory'
- say 'it matches a pattern against a string and prints either yes ot no.'
- say 'e.g. Match #?.gif hello.gif --> yes'
- say ' Match #?.gif hello.lha --> no'
- say 'please install this program somewhere in your shell path.'
- end
- end
- when abbrev('-IFMODIFIED',uflagname,2) then
- gv.use_modified_since = gv.true
- when abbrev('-ASSOCIATIVE',uflagname,2) then
- gv.use_associative = gv.true
- when abbrev('-DELAY',uflagname,2) then
- parse var arglist gv.delay arglist
- otherwise
- say 'unused argument :' flagname
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* SearchLine */
- /*=======================================================================*/
-
- SearchLine: Procedure expose gv. agenda.
- parse arg buffer,prefix,agenda
-
- signal ON break_c
-
- line = buffer
- do while line ~= ''
- parse var line before method '=' '"' link '"'line
- method = upper(strip(method))
- if link ~= '' & (method = 'HREF' | method = 'SRC') then do
- AddToAgenda(agenda,CompleteUrl(prefix,link))
- end
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* CompleteUrl */
- /*=======================================================================*/
-
- CompleteUrl: Procedure expose gv.
- parse arg prefix,link
-
- if prefix = '' then /* nothing else we can do */
- RETURN link
-
- if left(link,6) = 'mailto' then
- RETURN link
-
- parse var link method '://' rest
- if rest ~= '' then /* is already complete */
- do
- RETURN link
- end
- else
- do
- parse var prefix front '|' back
- select
- when left(link,1) = '/' then /* absolute path */
- RETURN front||link
- when left(link,1) = '#' then /* internal path */
- RETURN front
- otherwise /* probably starts with '../' */
- if (right(back,1) = '/') | (back = '') then
- RETURN front'/'back||link
- else
- RETURN front'/'back'/'link
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* AddToAgenda */
- /*=======================================================================*/
-
- AddToAgenda: Procedure expose gv. agenda.
- parse arg agenda,item,nocheck
-
- if (gv.use_associative) & (nocheck = '') then
- RETURN(BTAddToAgenda(item))
-
- if nocheck = '' then do
- if AgendaSearch(item,agenda) then
- RETURN(gv.false)
- end
-
- if open(listf,agenda,'A') then
- do
- writeln(listf,item)
- close(listf)
- end
- else
- do
- if open(listf,agenda,'Write') then
- do
- writeln(listf,item)
- close(listf)
- end
- else
- say 'unable to add to agenda file:' agenda
- end
- RETURN gv.true
-
- AddToPruned: Procedure expose gv. prune.
- parse arg agenda,item
-
- if gv.use_associative then
- RETURN(BTAddToPrune(item))
-
- AddToAgenda(agenda,item)
- RETURN gv.true
-
- AddToFailed: Procedure expose gv. failed.
- parse arg agenda,item
-
- if gv.use_associative then
- RETURN(BTAddToFailed(item))
-
- AddToAgenda(agenda,item)
- RETURN gv.true
-
- /*=======================================================================*/
- /* RemoveFromAgenda */
- /*=======================================================================*/
-
- RemoveFromAgenda: Procedure expose gv.
- parse arg agenda
-
- if gv.use_associative then
- RETURN gv.true
-
- signal ON break_c
- tmpname = agenda||'.tmp'
-
- if open(listf,agenda,'Read') then do
- item = readln(listf)
- if open(tmp,tmpname,'Write') then
- do
- do while ~eof(listf)
- line = readln(listf)
- if length(line) > 0 then
- writeln(tmp,line)
- end
- close(tmp)
- close(listf)
-
- address command 'delete >NIL:' agenda
- address command 'rename' tmpname agenda
- end
- else
- do
- close(listf)
- say 'unable to remove item from agenda file:' agenda
- end
- /* ***** NOTE: listf has been closed ***** */
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* AgendaTop */
- /*=======================================================================*/
-
- AgendaTop: Procedure expose gv. agenda.
- parse arg agenda
-
- if gv.use_associative then
- RETURN(BTAgendaTop())
-
- top = ''
- if open(listf,agenda,'Read') then do
- if eof(listf) then
- top = ''
- else
- top = readln(listf)
- close(listf)
- end
- RETURN top
-
- /*=======================================================================*/
- /* AgendaSearch */
- /*=======================================================================*/
-
- AgendaSearch: Procedure expose gv.
- parse arg target,file
-
- if open(listf,file,'Read') then do
- do while ~eof(listf)
- line = readln(listf)
- if line = target then do
- close(listf)
- RETURN gv.true
- end
- end
- close(listf)
- end
- RETURN gv.false
-
- /*=======================================================================*/
- /* PruneUrl */
- /*=======================================================================*/
-
- PruneUrl: Procedure expose gv.
- parse arg url
-
- seen_previously = gv.false
- if exists(gv.prune_file) then do
- if AgendaSearch(url,gv.prune_file) then
- seen_previously = gv.true
- end
-
- RETURN seen_previously
-
- /*=======================================================================*/
- /* DoAgenda */
- /*=======================================================================*/
-
- DoAgenda: Procedure expose gv. agenda. failed. prune.
- parse arg agenda
-
- signal ON break_c
-
- count = 0
- url = AgendaTop(agenda)
- do while url ~= ''
- if left(url,1) = gv.level_marker then do
- parse var url dummy gv.depth
- RemoveFromAgenda(agenda)
- url = AgendaTop(agenda)
- iterate
- end
- parse value ParseUrl(url) with ok','url','method','host','port','path
- select
- when ~ok then
- nop
- when (gv.number_iterations ~= '') & (count >= gv.number_iterations) then
- break
- when (gv.depth >= gv.max_depth) then
- break
- when PruneUrl(url) then
- nop
- when ~MatchHost(host,gv.match_host) then
- nop
- when ~MatchPath(path,gv.match_path) then
- nop
- when method = 'HTTP' then
- do
- count = count + 1
- if gv.output_file = '' then
- fname = MakePathname(host,path)
- else
- fname = gv.output_file
- say '['right(count,4)']' url '->' fname
- if ~AppendHTTP(host,port,path,fname,agenda,url) then
- AddToFailed(gv.failed_file,url)
- else
- AddToPruned(gv.prune_file,url)
- end
- when method = 'FILE' then
- do
- /* FILE is always on localhost. Any other host uses FTP */
- count = count + 1
- say '['right(count,4)']' 'reading' url
- parse var gv.local_prefix before'|'after
- if (right(after,1) ~= ':') & (right(after,1) ~= '/') then
- after = after'/'
- if pos(':',path) = 0 then
- fname = after||path
- else
- fname = path
- if TEXTFILE_TYPE(fname) then
- AppendFile(fname,,agenda)
- AddToPruned(gv.prune_file,url)
- end
- otherwise
- nop
- end
- RemoveFromAgenda(agenda)
- address command 'wait' gv.delay
- url = AgendaTop(agenda)
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* MatchHost */
- /*=======================================================================*/
-
- MatchHost: Procedure expose gv.
- parse arg hostname,pattern
-
- if pattern = '' then
- RETURN gv.true
-
- if ~gv.use_match then
- do
- host = hostname
- patn = pattern
- do while (host ~= '') & (patn ~= '')
- parse var host hname '.' host
- parse var patn pname '.' patn
- if (hname ~= pname) & (pname ~= '*') then
- RETURN gv.false
- end
- if patn = '*' then
- RETURN gv.true
- if host ~= patn then
- RETURN gv.false
- end
- else
- do
- address command 'match >ENV:result' '"'pattern'"' '"'hostname'"'
- if GetEnvVar('result') = 'yes' then
- RET = gv.true
- else
- RET = gv.false
- RETURN RET
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* MatchPath */
- /*=======================================================================*/
-
- MatchPath: Procedure expose gv.
- parse arg pathname,pattern
-
- if pattern = '' then
- RETURN gv.true
-
- if ~gv.use_match then
- do
- path = pathname
- patn = pattern
- do while (path ~= '') & (patn ~= '')
- parse var path dname '/' path
- parse var patn pname '/' patn
- if (dname ~= pname) & (pname ~= '*') then
- RETURN gv.false
- end
- if patn = '*' then
- RETURN gv.true
- if path ~= patn then
- RETURN gv.false
- end
- else
- do
- address command 'match >ENV:result' '"'pattern'"' '"'pathname'"'
- if GetEnvVar('result') = 'yes' then
- RETURN gv.true
- else
- RETURN gv.false
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* MakePathname */
- /*=======================================================================*/
-
- MakePathname: Procedure expose gv.
- parse arg host,path
-
- parse var gv.local_prefix localhost'|'savepath
- ret = savepath||host'/'path
- if (right(path,1) = '/') | (path = '') then
- ret = ret||'index.html'
- RETURN ret
-
- /*=======================================================================*/
- /* Break_C */
- /*=======================================================================*/
-
- Break_C:
- say '*** BREAK ***'
- say 'Transfer aborted'
- if gv.use_associative then do
- BTDumpAgendaToFile(gv.agenda_file)
- BTDumpFailedToFile(gv.failed_file)
- end
- EXIT 20
-
- /*=======================================================================*/
- /* AppendHTTP */
- /*=======================================================================*/
-
- AppendHTTP: Procedure expose gv. agenda. prune. failed.
- parse arg host,port,path,filename,agenda,url
-
- signal ON break_c
-
- cache_header = ''
-
- /* check that we can actually proceed */
- if ~showlist('H','TCP') then do
- say 'TCP: device is not mounted'
- RETURN gv.false
- end
-
- if gv.output_file = '' then
- output_mode = 'Write'
- else
- do
- if ~exists(gv.output_file) then
- output_mode = 'Write'
- else
- output_mode = 'Append'
- end
-
- if right(path,1) = '/' then
- prefix_path = path
- else
- do
- lastslash = lastpos('/',path)
- if lastslash = 0 then
- prefix_path = ''
- else
- prefix_path = left(path,lastslash)
- end
-
- /* determine correct socket to open (check for proxy) */
- if gv.http_proxy_host ~= '' then
- do
- /* use proxy host */
- socketname = 'tcp:'gv.http_proxy_host'/'gv.http_proxy_port
- if port = '' then
- do
- get_command = 'GET http://'host'/'path
- prefix = 'http://'host'|'prefix_path
- end
- else
- do
- get_command = 'GET http://'host':'port'/'path
- prefix = 'http://'host':'port'|'prefix_path
- end
- end
- else
- do
- /* no proxy, so can do away with method,host,port */
- if port = '' then
- do
- socketname = 'tcp:'host'/80'
- get_command = 'GET /'path
- prefix = 'http://'host'|'prefix_path
- end
- else
- do
- socketname = 'tcp:'host'/'port
- get_command = 'GET /'path
- prefix = 'http://'host':'port'|'prefix_path
- end
- end
-
- OK = gv.false
- /* Open socket to host */
- if open(socket,socketname,'A') then
- do
- /* send FullRequest command */
- writech(socket,get_command 'HTTP/1.0'||gv.crlf)
- writech(socket,'From:' gv.user_address||gv.crlf)
- writech(socket,'User-Agent: GetUrl.rexx' gv.version 'by burton@cs.latrobe.edu.au'||gv.crlf)
- writech(socket,'Accept: */*, text/plain, text/html'||gv.crlf)
- if (gv.http_proxy_host ~= '') & (~gv.use_proxy_cache) then
- writech(socket,'Pragma: no-cache'||gv.crlf)
- if gv.use_modified_since & exists(filename) then do
- parse value statef(filename) with type len blocks prot days mins ticks comment
- /*say '<'days'/'mins'>'*/
- parse value ApplyTimeZone(days,mins) with days','mins
- /*say '<'days'/'mins'>'*/
- hr = mins % 60
- mn = mins // 60
- sc = 0
- /*dy = left(date('weekday',days),3)*/
- dy = date('weekday',days)
- parse value date('normal',days) with dd mmm yyyy
- dmy = dd'-'mmm'-'right(yyyy,2)
- dt = dy',' dmy right(hr,2,'0')':'right(mn,2,'0')':'right(sc,2,'0')
- /*say '{'dt'}'*/
- writech(socket,'If-Modified-Since:' dt||gv.crlf)
- end
- writech(socket,gv.crlf)
-
- /* read first line of response */
- if ~eof(socket) then
- checkline = readch(socket,8)
-
- OK = gv.true
- if checkline = 'HTTP/1.0' then
- do
- header = gv.false
-
- /* Deal with header */
- line = readln(socket)
- parse var line response_code rest
- if response_code = '200' then
- do
- OK = gv.true
- if gv.save_headers then
- hfile = filename'.HDF'
- else
- hfile = gv.header_file
- header = open(headf,hfile,'Write')
- end
- else
- OK = gv.false
-
- /* read off header returned */
- do while (~eof(socket)) & (line ~= '') & (line ~= gv.cr) & (line ~= gv.crlf)
- line = readln(socket)
- if header then
- writeln(headf,line)
- if gv.max_length ~= '' then do
- if upper(left(line,16)) = 'CONTENT-LENGTH: ' then do
- parse UPPER var line 'CONTENT-LENGTH: 'size
- if size > gv.max_length then do
- OK = gv.false
- response_code = 'file too large ('size' Bytes)'
- end
- end
- end
- end
-
- if header then
- close(headf)
- end
-
- if OK then
- do
- CreateDirectories(filename)
- if open(new,filename,output_mode) then
- do
- if HTMLFILE_TYPE(filename) then do
- date_string = time('Civil') date('Normal')
- original_link = '[<a href="'url'">Original</a>]'
- local_link = '[<a href="file://localhost/'filename'">Local copy</a>]'
- author_string = 'Saved by GetURL.rexx by <a href="http://www.cs.latrobe.edu.au/~burton/">James Burton</a>'
- cache_header = '<h6>Date saved : 'date_string' 'original_link' 'local_link' 'author_string'</h6>'
- writeln(new,cache_header)
- end
- if checkline ~= 'HTTP/1.0' then
- writech(new,checkline)
-
- /* byte by byte transfer of the file */
- do while ~eof(socket)
- buffer = readch(socket,4096)
- writech(new,buffer)
- end
- close(new)
- end
- else
- say 'unable to open file:' filename 'for writing'
- end
- else
- do
- say 'HTTP ERROR' response_code':' rest
- say 'COMMAND:' get_command
- OK = gv.false
- end
- close(socket)
- end
- else
- do
- say 'unable to reach host:' host
- say 'socket:' socketname
- OK = gv.false
- end
- if OK then
- SearchFile(filename,prefix,agenda,length(cache_header))
- RETURN OK
-
- /*=======================================================================*/
- /* ApplyTimeZone */
- /*=======================================================================*/
-
- ApplyTimeZone: Procedure expose gv.
- parse arg days,mins
-
- sign = left(gv.timezone,1)
- offset = strip(substr(gv.timezone,2))
- mn_offset = right(offset,2)
- hr_offset = left(offset,length(offset)-2)
-
- hr = mins % 60
- mn = mins // 60
-
- if sign = '+' then
- do /* subtract the offset */
- if hr_offset > hr then do
- days = days - 1
- hr = hr + 24
- end
- hr = hr - hr_offset
-
- if mn_offset > mn then do
- hr = hr - 1
- mn = mn + 60
- end
- mn = mn - mn_offset
- end
- else
- do /* add the offset */
- hr = hr + hr_offset
- if hr > 24 then do
- days = days + 1
- hr = hr - 24
- end
-
- mn = mn + mn_offset
- if mn > 60 then do
- hr = hr + 1
- mn = mn - 60
- end
- end
-
- mins = hr * 60 + mn
- RETURN days','mins
-
- /*=======================================================================*/
- /* SearchFile */
- /*=======================================================================*/
-
- SearchFile: Procedure expose gv. agenda. prune.
- parse arg filename,prefix,agenda,skip_length
-
- signal ON break_c
-
- /* now search it */
- if HTMLFILE_TYPE(filename) then do
- if open(new,filename,'Read') then do
- if skip_length > 0 then
- readch(new,skip_length) /* skip the top of the file */
- do while ~eof(new)
- buffer = readln(new)
- SearchLine(buffer,prefix,agenda)
- end
- if (gv.use_level_marker) & (~gv.use_associative) then
- AddToAgenda(agenda,gv.level_marker gv.depth+1,'force')
- close(new)
- end
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* CreateDirectories */
- /*=======================================================================*/
-
- CreateDirectories: Procedure expose gv.
- parse arg filename
-
- signal ON break_c
-
- parse var filename dev ':' path
- current_dir = dev':'
- do while path ~= ''
- parse var path dir '/' path
- if right(current_dir,1) = ':' then
- current_dir = current_dir||dir
- else
- current_dir = current_dir'/'dir
- if path ~= '' then
- if ~exists(current_dir) then
- address command 'makedir' current_dir
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* HTMLFILE_TYPE */
- /*=======================================================================*/
-
- HTMLFILE_TYPE: Procedure expose gv.
- parse arg path
-
- dot = lastpos('.',path)
- if dot = 0 then
- do
- if right(path,1) = '/' then
- RETURN gv.true
- else
- RETURN gv.false
- end
- else
- do
- extension = upper(substr(path,dot,4))
- if extension = '.HTM' then
- RETURN gv.true
- else
- RETURN gv.false
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* TEXTFILE_TYPE */
- /*=======================================================================*/
-
- TEXTFILE_TYPE: Procedure expose gv.
- parse arg path
-
- dot = lastpos('.',path)
- if dot = 0 then
- do
- if right(path,1) = '/' then
- RETURN gv.true
- else
- RETURN gv.false
- end
- else
- do
- extension = upper(substr(path,dot,4))
- if (extension = '.TXT') | (extension = '.DOC') | (extension = '.TEX') | (extension = '.HTM') then
- RETURN gv.true
- else
- RETURN gv.false
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* AppendFile */
- /*=======================================================================*/
-
- AppendFile: Procedure expose gv. agenda. prune.
- parse arg input,output,agenda
-
- signal ON break_c
-
- out = gv.false
- if output ~= '' then do
- out = open(outputfile,output,'Write')
- if ~out then
- say 'unable to open output file :' output
- end
-
- if input = '-' then
- do
- do while ~eof(stdin)
- buffer = readln(stdin)
- SearchLine(buffer,gv.local_prefix,agenda)
- if out then
- writeln(outputfile,buffer)
- end
- end
- else
- do
- if open(inputfile,input,'Read') then
- do
- do while ~eof(inputfile)
- buffer = readln(inputfile)
- SearchLine(buffer,gv.local_prefix,agenda)
- if out then
- writeln(outputfile,buffer)
- end
- close(inputfile)
- end
- else
- say 'unable to open input file :' input
- end
-
- if out then do
- close(outputfile)
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* PrintUsage */
- /*=======================================================================*/
-
- PrintUsage:
- say 'GetURL' gv.version ': Written by James Burton (burton@cs.latrobe.edu.au)'
- say 'USAGE :'
- say ' rx geturl [<url>] [<options>]'
- say
- say '<url> can be'
- say ' http:[//]host[:port]/[path]'
- say
- say '<options> can be'
- say ' -Help display this message'
- say ' -Problem send message to me, bug report, query etc.'
- say ' -NewVersion <file> downloads most recent version of GetURL.rexx'
- say ' -PatternMatching downloads pattern matching utility'
- say ' -Associative use associative array rather than files'
- say ' -Delay <num> delay <num> seconds before next file'
- say
- say ' -Recursive recursively collect files, following contained URLs'
- /*say ' -Directory <dir> recursively collect URLs from text files in <dir>'*/
- /*say ' -Update <dir> recursively check files in <dir> for updates'*/
- say ' -NoProxy fetch directly from remote site, ignore proxy config'
- say ' -NoProxyCache force proxy to reload cache'
- say ' -Retry retry previously failed URLs (if -recursive set)'
- say
- say ' -Host <pattern> only collect files from hosts matching <pattern>'
- say ' -Path <pattern> only collect files from paths matching <pattern>'
- say ' -URL <pattern> only collect files from URLs matching <pattern>'
- say ' -Number <num> only collect <num> files'
- say ' -Length <num> only collect files smaller than <num> bytes'
- say ' -IfModified only collect files modified since cached'
- say ' -Depth <num> only collect files from <num> levels of recursion'
- say
- say ' -Input <filename> read input from file instead of <url>'
- say ' -Input read input from standard input instead of <url>'
- say ' -Output <filename> concatenate all files onto <filename>'
- say ' -SaveRoot <dir> save files under <dir> rather than Mosaic:'
- say ' -Visited <file> use specified visited file, won''t revisit these URLs'
- say ' -Unvisited <file> use specified unvisited file, visit these URLs first'
- say ' -Failed <file> use specified failed file, these ones did not transfer'
- say ' -SaveHeaders save transfer header to *.HDF'
- say
- say '<options> can all be abbreviated.'
- say
- RETURN gv.true
-
- /*=======================================================================*/
- /* Configuration */
- /*=======================================================================*/
-
- Configure: Procedure expose gv.
-
- /* look for HTTP Proxy server */
- if GetEnvVar('WWW_HTTP_GATEWAY') ~= '' then
- parse value GetEnvVar('WWW_HTTP_GATEWAY') with 'http://'gv.http_proxy_host':'gv.http_proxy_port'/'
- else if GetEnvVar('HTTP_PROXY') ~= '' then
- parse value GetEnvVar('HTTP_PROXY') with 'http://'gv.http_proxy_host':'gv.http_proxy_port'/'
- else
- gv.http_proxy_host = ''
-
- /* look for FTP Proxy server */
- if GetEnvVar('WWW_FTP_GATEWAY') ~= '' then
- parse value GetEnvVar('WWW_FTP_GATEWAY') with 'http://'gv.ftp_proxy_host':'gv.ftp_proxy_port'/'
- else if GetEnvVar('FTP_PROXY') ~= '' then
- parse value GetEnvVar('FTP_PROXY') with 'http://'gv.ftp_proxy_host':'gv.ftp_proxy_port'/'
- else
- gv.ftp_proxy_host = ''
-
- /* look for User's login name */
- if GetEnvVar('USER') ~= '' then
- user = GetEnvVar('USER')
- else
- do
- if GetEnvVar('LOGNAME') ~= '' then
- user = GetEnvVar('LOGNAME')
- else
- user = GetEnvVar('USERNAME')
- end
-
- /* look for User's real name */
- real = GetEnvVar('REALNAME')
-
- /* look for Machine's name */
- if GetEnvVar('HOSTNAME') ~= '' then
- host = GetEnvVar('HOSTNAME')
- else
- do
- if GetEnvVar('HOST') ~= '' then
- host = GetEnvVar('HOST')
- else
- host = GetEnvVar('NODENAME')||GetEnvVar('DOMAINNAME')
- end
-
- /* construct email address */
- gv.user_address = real '<'user'@'host'>'
-
- /* look for match program */
- address command 'which >env:result match'
- if GetEnvVar('result') ~= '' then
- do
- r = GetEnvVar('result')
- address command 'version >ENV:result' r 'full'
- r = GetEnvVar('result')
- r = compress(r,'0a'x)
- if r = 'Match 1.0 (16/01/95)(16-Jan-95) James Burton (burton@cs.latrobe.edu.au)' then
- gv.use_match = gv.true
- else
- gv.use_match = gv.false
- end
- else
- gv.use_match = gv.false
-
- /* make sure we have a valid timezone */
- if gv.timezone = '' then do
- say 'Since this is the first time you have run GetURL' gv.version
- say 'we need to know the Time Zone you are in relative to GMT (England time)'
- say 'Please enter the hrs & mins e.g. +11:00 (AEDT) or +10:30 (ACDT)'
- writech(stdout,'Enter TimeZone [+/-]<hr>:<mins> ')
- line = readln(stdin)
- parse value substr(line,2) with hr ':' mn
- sign = left(line,1)
- say 'TimeZone is' sign||hr':'mn
- gv.timezone = sign||hr||mn
- parse source dummy dummy dummy resolved_name dummy
- if open(f,resolved_name,'read') then
- do
- t_tmpname = 't:tmp-'tmpname()
- if open(fout,t_tmpname,'Write') then
- do
- i = 0
- do while ~eof(f)
- line = readln(f)
- if left(line,11) = 'gv.timezone' then
- do
- writeln(fout,'gv.timezone =' '"'gv.timezone'"')
- writech(stdout,'I')
- end
- else
- do
- writeln(fout,line)
- if (i // 30) = 0 then
- writech(stdout,'.')
- end
- i = i + 1
- end
- writeln(stdout,'<<')
- close(fout)
- /*address command 'copy' t_tmpname resolved_name*/
- address command 'copy' t_tmpname 't:GetURL-configured.rexx'
- address command 'delete >NIL:' t_tmpname
- say 'Please use t:GetURL-configured.rexx in future'
- end
- else
- say 'unable to open temp file'
- close(f)
- end
- else
- say 'unable to find the program file, please edit the timzone in by hand.'
- end
- RETURN gv.true
-
- /*=======================================================================*/
- /* OpenLibrary */
- /*=======================================================================*/
-
- OpenLibrary: Procedure expose gv.
- parse arg libname,strict
-
- ret = gv.true
-
- if ~show('Libraries',libname) then do
- if ~addlib(libname,0,-30,0) then do
- ret = gv.false
- if strict then do
- say 'unable to open ARexx library :' libname
- EXIT 20
- end
- end
- end
- RETURN ret
-
- /*=======================================================================*/
- /* GetEnvVar */
- /*=======================================================================*/
-
- GlobalGetEnv: Procedure expose gv.
- parse arg env_variable
-
- if open(varfile,'ENV:'||env_variable,'Read') then
- do
- line = readln(varfile)
- close(varfile)
- end
- else
- line = ''
- RETURN line
-
- GetEnvVar: Procedure expose gv.
- parse arg env_variable
-
- if show('Libraries','rexxarplib.library') then
- do
- r = getenv(env_variable)
- if right(r,1) = '0a'x then
- r = left(r,length(r)-1)
- RETURN r
- end
- else
- RETURN GlobalGetEnv(env_variable)
- end
- RETURN
-
- /*=======================================================================*/
- /* ParseUrl */
- /*=======================================================================*/
-
- ParseUrl: Procedure expose gv.
- parse arg ret_url
-
- signal ON break_c
- ok = gv.true
-
- parse var ret_url ret_method':'rest
- ret_method = upper(ret_method)
- select
- when ret_method = 'HTTP' then
- do
- /* set defaults for HTTP */
- ret_host = ''
- ret_port = 80
- ret_path = ''
-
- parse var rest '//'host_part'/'ret_path
- if host_part = '' then /* allow missing '//' */
- parse var rest host_part'/'ret_path
- parse var host_part ret_host':'ret_port
- parse var ret_path ret_path '#' dummy /* these can be safely ignored */
- if pos('?',ret_path) = 0 then
- ok = gv.true
- else
- ok = gv.false
- end
- when ret_method = 'FILE' then
- do
- /* set defaults for FILE */
- ret_host = ''
- ret_port = ''
- ret_path = ''
-
- parse var rest '//'host_part'/'ret_path
- if host_part = '' then
- parse var rest host_part'/'ret_path
- parse var host_part ret_host':'ret_port
-
- if (ret_host = 'localhost') | (ret_host = '') then
- ret_host = 'localhost'
- else
- ret_method = 'FTP'
- ok = gv.true
- end
- when ret_method = 'FTP' then
- do
- say 'FTP unimplemented'
- ok = gv.false
- end
- when ret_method = 'MAILTO' then
- do
- say 'MAILTO unimplemented'
- ok = gv.false
- end
- when ret_method = 'TELNET' then
- do
- say 'TELNET unimplemented'
- ok = gv.false
- end
- when ret_method = 'NEWS' then
- do
- say 'NEWS unimplemented'
- ok = gv.false
- end
- when ret_method = 'WAIS' then
- do
- say 'WAIS unimplemented'
- ok = gv.false
- end
- when ret_method = 'GOPHER' then
- do
- say 'GOPHER unimplemented'
- ok = gv.false
- end
- otherwise
- do
- /* not much more we can do if do not recognise the access method */
- say 'unknown HTTP Method :' ret_method
- ok = gv.false
- end
- end
- RETURN ok','ret_url','ret_method','ret_host','ret_port','ret_path
-
- /*=======================================================================*/
- /* TmpName */
- /*=======================================================================*/
-
- TmpName:
- RETURN date(days)"-"time(seconds)
-
- /*=======================================================================*/
- /* SendMail */
- /*=======================================================================*/
-
- SendMail: Procedure expose gv.
- parse arg openfile,to_address,subject
-
- parse var to_address to_login'@'to_host
- parse var gv.user_address real'<'from_login'@'from_host'>'
- socketname = 'tcp:'to_host'/smtp'
- end_mail = '2e'x
- send_timestamp = left(date('weekday'),3) date('normal') time('normal')
- message_id = tmpname()'@'from_host
-
- if open(socket,socketname,'Write') then
- do
- if ~CheckMailResult(socket) then call SendMailQuit
- writech(socket,'HELO' from_host||gv.crlf)
- if ~CheckMailResult(socket) then call SendMailQuit
- writech(socket,'MAIL FROM:' '<'from_login'@'from_host'>'||gv.crlf)
- if ~CheckMailResult(socket) then call SendMailQuit
- writech(socket,'RCPT TO:' '<'to_address'>'||gv.crlf)
- if ~CheckMailResult(socket) then call SendMailQuit
- writech(socket,'DATA'||gv.crlf)
- if ~CheckMailResult(socket) then call SendMailQuit
- writech(socket,'From' from_login'@'from_host send_timestamp||gv.crlf)
- writech(socket,'Received: by' from_host '(GetUrl.rexx' gv.version 'burton@cs.latrobe.edu.au)'||gv.crlf)
- writech(socket,'Message-Id:' '<'message_id'>'||gv.crlf)
- writech(socket,'Date:' send_timestamp '+0000'||gv.crlf)
- writech(socket,'From:' gv.user_address||gv.crlf)
- writech(socket,'To:' to_address||gv.crlf)
- writech(socket,'Subject:' subject||gv.crlf)
- writech(socket,'X-Mailer: GetURL.rexx' gv.version 'by James Burton (burton@cs.latrobe.edu.au)'||gv.crlf)
- writech(socket,gv.crlf)
- address command 'wait 1'
- do while ~eof(openfile)
- line = readln(openfile)
- if line = '.' then
- break
- writech(socket,line||gv.crlf)
- end
- writech(socket,end_mail||gv.crlf)
- if CheckMailResult(socket) then
- say 'Message sent OK.'
- SendMailQuit:
- writech(socket,'QUIT'||gv.crlf)
- close(socket)
- end
- else
- say 'could not open SMTP socket to host:' to_host
- RETURN gv.true
-
- CheckMailResult: Procedure expose gv.
- parse arg socket
-
- do while ~eof(socket)
- line = readln(socket)
- /*say '"'line'"'*/
- if substr(line,4,1) ~= '-' then
- break
- end
- status_code = left(line,3)
- /*say '['status_code']'*/
- if status_code > 399 then
- RETURN gv.false
- else
- RETURN gv.true
- RETURN gv.true
-
- /*=======================================================================*/
- /* AddToFailed */
- /* adds item to next failed */
- /*=======================================================================*/
-
- BTAddToFailed: Procedure expose gv. failed.
- parse arg item
-
- say 'AddToFailed ' item
- if failed.item = 0 then /* if not already known */
- do
- say 'accepted'
- next = failed.tail + 1 /* index for this item */
- failed.tail = next /* advance tail of failed */
- failed.next = item /* store item at tail */
- failed.item = 1
- end
-
- RETURN gv.true
-
- /*=======================================================================*/
- /* AddToPrune */
- /* adds item to next prune */
- /*=======================================================================*/
-
- BTAddToPrune: Procedure expose gv. prune.
- parse arg item
-
- say 'AddToPrune ' item
- if prune.item = 0 then /* if not already known */
- do
- say 'accepted'
- next = prune.tail + 1 /* index for this item */
- prune.tail = next /* advance tail of prune */
- prune.next = item /* store item at tail */
- prune.item = 1
- end
-
- RETURN gv.true
-
- /*=======================================================================*/
- /* AddToAgenda */
- /* adds item to tail of agenda */
- /*=======================================================================*/
-
- BTAddToAgenda: Procedure expose gv. agenda.
- parse arg item
-
- say 'AddToAgenda' item
- say agenda.item
- if (agenda.item ~= 0) then say 'already known'
- if (index(item,'#') ~= 0) then say 'is a # reference'
- /* if not already known and not an internal reference */
- if (agenda.item = 0) & (index(item,'#') = 0) then
- do
- say 'accepted'
- next_depth = gv.depth + 1 /* next recursion level */
- next = agenda.tail + 1 /* index for this item */
- agenda.tail = next /* advance tail of agenda */
- agenda.next = item /* store item at tail */
- agenda.item = next_depth /* and store depth at item */
- end
-
- RETURN gv.true
-
- /*=======================================================================*/
- /* AgendaTop */
- /* return the topmost (first) item in the agenda */
- /* and remove it from the agenda */
- /*=======================================================================*/
-
- BTAgendaTop: Procedure expose gv. agenda.
-
- say 'AgendaTop'
- next = agenda.head /* location of next item */
- say 'next ' next
- item = agenda.next /* get it */
- say 'item ' item
- if (item ~= 0) then
- do
- gv.depth = agenda.item /* set new depth */
- say 'depth ' gv.depth
- agenda.next = 0 /* drop it from agenda */
- agenda.item = 0 /* and forget its depth */
- agenda.head = next + 1 /* and advance the head */
- end
- else
- do
- item = ''
- end
- say 'AgendaTop ' item
-
- RETURN item
-
- /*=======================================================================*/
- /* AddFileTo */
- /* load an array from a file */
- /*=======================================================================*/
-
- BTAddFileTo: Procedure expose gv. agenda. prune. failed.
- parse arg array,filename
-
- say 'AddFileTo ' array filename
- /* performance shouldnt be an issue here */
-
- if open(filep, filename, 'Read') then
- do
- do while ~eof(filep)
- buffer = readln(filep)
- if (buffer ~= '') then
- select
- when (array = prune) then
- AddToPrune(buffer)
- when (array = agenda) then
- AddToAgenda(buffer)
- when (Array = failed) then
- AddToFailed(buffer)
- otherwise
- nop
- end /* select */
- end /* while */
- close(filep)
- end /* if */
- else
- do
- say 'AddFileTo ' array ' could not read ' filename
- end /* else */
-
- say 'AddFileTo exits'
- RETURN gv.true
-
- /*=======================================================================*/
- /* DumpAgendaTofile */
- /* unload agenda to a file */
- /*=======================================================================*/
-
- BTDumpAgendaToFile: Procedure expose gv. agenda.
- parse arg filename
-
- say 'DumpAgendaToFile ' filename
- /* note: we deliberately do NOT allow this to effect the global 'depth' */
-
- depth = 1
- if open(filep, filename, 'Write') then
- do
- buffer = AgendaTop()
- do while (buffer ~= '' )
- writeln(filep, buffer)
- buffer = AgendaTop()
- end /* while */
- agenda.head = 1
- agenda.tail = 0
- close(filep)
- end /* if */
- else
- do
- say 'DumpAgendaToFile could not write to ' filename
- end /* else */
-
- say 'DumpAgendaToFile exits '
- RETURN gv.true
-
- /*=======================================================================*/
- /* DumpFailedToFile */
- /* unload failed to a file */
- /*=======================================================================*/
-
- BTDumpFailedToFile: procedure expose gv. failed.
- parse arg filename
-
- say 'DumpFailedToFile ' filename
- if open(filep, filename, 'Write') then
- do
- i = 1
- buffer = failed.i
- do while (buffer ~= '0' )
- writeln(filep, buffer)
- failed.i = 0
- failed.buffer = 0
- i = i + 1
- buffer = failed.i
- end /* while */
- failed.tail = 0
- close(filep)
- end /* if */
- else
- do
- say 'DumpFailedToFile could not write to ' filename
- end /* else */
-
- say 'DumpFailedToFile exits'
- RETURN gv.true
-
- /*=======================================================================*/
- /* End of File 'geturl.rexx' */
- /*=======================================================================*/
-